{
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7-final"
  },
  "orig_nbformat": 2,
  "kernelspec": {
   "name": "Python 3.7.7 64-bit ('st': conda)",
   "display_name": "Python 3.7.7 64-bit ('st': conda)",
   "metadata": {
    "interpreter": {
     "hash": "cd15d060c920a31a2004e0f1a0ebfcce168f4cda13577a5cbc6fc38473eace97"
    }
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2,
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def prepare_translation_datasets(data_path):\n",
    "    with open(os.path.join(data_path, \"train.trg\"), \"r\", encoding=\"utf-8\") as f:\n",
    "        sinhala_text = f.readlines()\n",
    "        sinhala_text = [text.strip(\"\\n\") for text in sinhala_text]\n",
    "\n",
    "    with open(os.path.join(data_path, \"train.src\"), \"r\") as f:\n",
    "        english_text = f.readlines()\n",
    "        english_text = [text.strip(\"\\n\") for text in english_text]\n",
    "\n",
    "    data = []\n",
    "    for sinhala, english in zip(sinhala_text, english_text):\n",
    "        data.append([\"translate sinhala to english\", sinhala, english])\n",
    "        data.append([\"translate english to sinhala\", english, sinhala])\n",
    "\n",
    "    train_df = pd.DataFrame(data, columns=[\"prefix\", \"input_text\", \"target_text\"])\n",
    "\n",
    "    with open(os.path.join(data_path, \"test.trg\"), \"r\", encoding=\"utf-8\") as f:\n",
    "        sinhala_text = f.readlines()\n",
    "        sinhala_text = [text.strip(\"\\n\") for text in sinhala_text]\n",
    "\n",
    "    with open(os.path.join(data_path, \"test.src\"), \"r\") as f:\n",
    "        english_text = f.readlines()\n",
    "        english_text = [text.strip(\"\\n\") for text in english_text]\n",
    "\n",
    "    data = []\n",
    "    for sinhala, english in zip(sinhala_text, english_text):\n",
    "        data.append([\"translate sinhala to english\", sinhala, english])\n",
    "        data.append([\"translate english to sinhala\", english, sinhala])\n",
    "\n",
    "    eval_df = pd.DataFrame(data, columns=[\"prefix\", \"input_text\", \"target_text\"])\n",
    "\n",
    "    return train_df, eval_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df, eval_df = prepare_translation_datasets(\"data/eng-sin\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "                               prefix  \\\n",
       "0        translate sinhala to english   \n",
       "1        translate english to sinhala   \n",
       "2        translate sinhala to english   \n",
       "3        translate english to sinhala   \n",
       "4        translate sinhala to english   \n",
       "...                               ...   \n",
       "2255049  translate english to sinhala   \n",
       "2255050  translate sinhala to english   \n",
       "2255051  translate english to sinhala   \n",
       "2255052  translate sinhala to english   \n",
       "2255053  translate english to sinhala   \n",
       "\n",
       "                                                input_text  \\\n",
       "0                                    මෙය සිදු වන්නේ කවදාද?   \n",
       "1                                   When will this happen?   \n",
       "2                                                 දුවන්න !   \n",
       "3                                                     Run!   \n",
       "4                                                - අනිද්දට   \n",
       "...                                                    ...   \n",
       "2255049  As those words indicate, some Christians will ...   \n",
       "2255050                             එතකොට ඔයා මොකද කරන්නේ?   \n",
       "2255051                           Then what are you to do?   \n",
       "2255052  ඔහු ජාන විද්‍යාඥයෙක්, මානව ජානවිද්‍යාව ගැන විශ...   \n",
       "2255053  Only that he's a genecist, an expert on human ...   \n",
       "\n",
       "                                               target_text  \n",
       "0                                   When will this happen?  \n",
       "1                                    මෙය සිදු වන්නේ කවදාද?  \n",
       "2                                                     Run!  \n",
       "3                                                 දුවන්න !  \n",
       "4                                - The day after tomorrow.  \n",
       "...                                                    ...  \n",
       "2255049  ඒ විරුද්ධවාදිකම් ආගමික නැත්නම් දේශපාලන නයකයන්ග...  \n",
       "2255050                           Then what are you to do?  \n",
       "2255051                             එතකොට ඔයා මොකද කරන්නේ?  \n",
       "2255052  Only that he's a genecist, an expert on human ...  \n",
       "2255053  ඔහු ජාන විද්‍යාඥයෙක්, මානව ජානවිද්‍යාව ගැන විශ...  \n",
       "\n",
       "[2255054 rows x 3 columns]"
      ],
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>prefix</th>\n      <th>input_text</th>\n      <th>target_text</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>translate sinhala to english</td>\n      <td>මෙය සිදු වන්නේ කවදාද?</td>\n      <td>When will this happen?</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>translate english to sinhala</td>\n      <td>When will this happen?</td>\n      <td>මෙය සිදු වන්නේ කවදාද?</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>translate sinhala to english</td>\n      <td>දුවන්න !</td>\n      <td>Run!</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>translate english to sinhala</td>\n      <td>Run!</td>\n      <td>දුවන්න !</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>translate sinhala to english</td>\n      <td>- අනිද්දට</td>\n      <td>- The day after tomorrow.</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>2255049</th>\n      <td>translate english to sinhala</td>\n      <td>As those words indicate, some Christians will ...</td>\n      <td>ඒ විරුද්ධවාදිකම් ආගමික නැත්නම් දේශපාලන නයකයන්ග...</td>\n    </tr>\n    <tr>\n      <th>2255050</th>\n      <td>translate sinhala to english</td>\n      <td>එතකොට ඔයා මොකද කරන්නේ?</td>\n      <td>Then what are you to do?</td>\n    </tr>\n    <tr>\n      <th>2255051</th>\n      <td>translate english to sinhala</td>\n      <td>Then what are you to do?</td>\n      <td>එතකොට ඔයා මොකද කරන්නේ?</td>\n    </tr>\n    <tr>\n      <th>2255052</th>\n      <td>translate sinhala to english</td>\n      <td>ඔහු ජාන විද්‍යාඥයෙක්, මානව ජානවිද්‍යාව ගැන විශ...</td>\n      <td>Only that he's a genecist, an expert on human ...</td>\n    </tr>\n    <tr>\n      <th>2255053</th>\n      <td>translate english to sinhala</td>\n      <td>Only that he's a genecist, an expert on human ...</td>\n      <td>ඔහු ජාන විද්‍යාඥයෙක්, මානව ජානවිද්‍යාව ගැන විශ...</td>\n    </tr>\n  </tbody>\n</table>\n<p>2255054 rows × 3 columns</p>\n</div>"
     },
     "metadata": {},
     "execution_count": 4
    }
   ],
   "source": [
    "train_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df.to_csv(\"data/train.tsv\", sep=\"\\t\")\n",
    "eval_df.to_csv(\"data/eval.tsv\", sep=\"\\t\")"
   ]
  }
 ]
}